library(MASS)
#library(gplots)
library(ggplot2)
library(Hmisc)
library(stringr)
#library(sciplot)
#library(RItools)
#library(aod)
library(stm)
library(dplyr)
library(tidyr)
library(fastDummies)
library(stringr)
library(reshape2)
library(lessR)
##############################################################
#	

set.seed(44)
remove(list=ls())
setwd("/Users/robertchaudoin/Dropbox/Kill_Scrapes/Media coverage data/classified_from_2019_06_10/")

###
# This file shows the data setup code that was used to insheet the raw data from the Cline Center (the solr_`i' objects)
#	and the hand coded data (the hc_`i' objects).
# This is the code that created the articles_2019_07_16.RData file, which is the first thing that gets loaded to do the STM analysis.
# This code is also used to calculate the accuracy scores referenced in the main manuscript and appendix.
# It also creates some ancillary files that helped show which sources were available for which dates.  Accuracy and source data are in Appendix A.



# Insheeting the hand coded data
setwd("/Users/robertchaudoin/Dropbox/Kill_Scrapes/Media coverage data/classified_from_2019_06_10/analysis_files/data/hand_coded/")


hc_1 <- read.csv("sample_2017.04.09-2017.06.17_sccoded.csv")
	hc_1 <- hc_1 %>% select(aid_long, target)
hc_2 <- read.csv("sample_2017.06.18-2017.08.26_sccoded.csv")
	hc_2 <- hc_2 %>% select(aid_long, target)
hc_3 <- read.csv("sample_2017.08.27-2017.10.25_sccoded.csv")
	hc_3 <- hc_3 %>% select(aid_long, target)
hc_4 <- read.csv("sample_2017.10.26-2018.01.03_sccoded.csv")
	hc_4 <- hc_4 %>% select(aid_long, target)
hc_5 <- read.csv("sample_2018.01.04-2018.03.14_sccoded.csv")
	hc_5 <- hc_5 %>% select(aid_long, target)
hc_6 <- read.csv("sample_2018.03.15-2018.05.23_sccoded.csv")
	hc_6 <- hc_6 %>% select(aid_long, target)
hc_7 <- read.csv("sample_2018.05.24-2018.08.01_sccoded.csv")
	hc_7 <- hc_7 %>% select(aid_long, target)
hc_8 <- read.csv("sample_2018.08.02-2018.10.10_sccoded.csv")
	hc_8 <- hc_8 %>% select(aid_long, target)
hc_9 <- read.csv("sample_2018.10.11-2018.12.20_sccoded.csv")
	hc_9 <- hc_9 %>% select(aid_long, target)

hc <- rbind(hc_1,hc_2,hc_3,hc_4,hc_5,hc_6,hc_7,hc_8,hc_9)
rm(list = ls()[grepl("hc_", ls())])


### Insheeting and prepping the solr results
setwd("/Users/robertchaudoin/Dropbox/Kill_Scrapes/Media coverage data/classified_from_2019_06_10/analysis_files/data/")

solr_1 <- read.csv("solr_2017--04-09--2018-12-20.csv")
	solr_1 <- subset(solr_1, select=-c(X,Unnamed..0))
	colnames(solr_1)[colnames(solr_1)=="title"] <- "title_solr_1"
	solr_1$title_solr_2 <- "NA"
solr_2 <- read.csv("solr_2016-01-01--2017-04-08.csv")
	solr_2 <- subset(solr_2, select=-c(source_host))
	colnames(solr_2)[colnames(solr_2)=="content"] <- "data"
	colnames(solr_2)[colnames(solr_2)=="title"] <- "title_solr_2"
	solr_2$aid_long <- as.character(solr_2$aid)
	solr_2$aid_long <- paste0(solr_2$aid_long,"_long")
	solr_2$title_solr_1 <- "NA"
	
solr <- rbind(solr_1,solr_2)
#rm(list = ls()[grepl("solr_", ls())])

# Dates
# Eg 10/26/17 01:23 AM
solr <- solr %>% separate(publication_date, c("date1","date2","date3"), sep = " ")
solr$pub_date_fmt <- as.Date(solr$date1)

solr <- solr %>% separate(date1, c("year","month","day"), sep = "-")

solr$publication_integer <- paste(solr$year,solr$month,solr$day,sep="")
solr$publication_integer <- as.numeric(solr$publication_integer)

solr$day <- str_remove(solr$day, "^0+")
solr$day <- as.numeric(solr$day)
solr$month <- str_remove(solr$month, "^0+")
solr$month <- as.numeric(solr$month)
solr$year <- as.numeric(solr$year)

# Source names
solr$baseurl <- str_remove(solr$source_url, "^http://+")
solr$baseurl <- str_remove(solr$baseurl, "^https://+")
solr$baseurl <- gsub("www.","",solr$baseurl)
solr$baseurl <- gsub("cebudailynews.","",solr$baseurl)
solr$baseurl <- gsub("entertainment.","",solr$baseurl)
solr$baseurl <- gsub("sports.","",solr$baseurl)
solr$baseurl <- gsub("usa.","",solr$baseurl)
solr$baseurl <- gsub("newsinfo.","",solr$baseurl)
solr$baseurl <- gsub("business.","",solr$baseurl)
solr$baseurl <- gsub("globalnation.","",solr$baseurl)
solr$baseurl <- gsub("lifestyle.","",solr$baseurl)
solr$baseurl <- gsub("news.","",solr$baseurl)
solr$baseurl <- gsub("technology.","",solr$baseurl)
solr$baseurl <- gsub("feeds.","",solr$baseurl)

solr <- solr %>% separate(baseurl, "source_name", sep = "\\.")

# There are a lot of wrong sources in here
solr <- solr[ which(solr$source_name == "manilatimes" | solr$source_name == "abs-cbn" | solr$source_name == "abs-cbncom:80/feed" |
	solr$source_name == "philippinetimes" | solr$source_name == "inquirer" | solr$source_name == "gmanetwork" | solr$source_name == "bworldonline" |
	solr$source_name == "interaksyon" | solr$source_name == "philstar" | solr$source_name == "sunstar" | solr$source_name == "manilastandardtoday"), ]

solr$source_name[which(solr$source_name == "abs-cbncom:80/feed")] = "abs-cbn"

# Removing some junk text     
solr$data <- gsub("Email:","",solr$data)
solr$data <- gsub("SunStar Philippines","",solr$data)
solr$data <- gsub("SunStar Bacolod","",solr$data)
solr$data <- gsub("SunStar Davao","",solr$data)
solr$data <- gsub("SunStar Cebu","",solr$data)
solr$data <- gsub("newsdesk@manilatimes.net","",solr$data)
solr$data <- gsub("opinion@manilatimes.net","",solr$data)
solr$data <- gsub("also available on your mobile phones, laptops, and tablets","",solr$data)
solr$data <- gsub("latest issues of sunstar","",solr$data)
solr$data <- gsub("address:","",solr$data)
solr$data <- gsub("2/f ","",solr$data)
solr$data <- gsub("sitio grande building","",solr$data)
solr$data <- gsub("409 a. soriano avenue","",solr$data)
solr$data <- gsub("soriano avenue","",solr$data)
solr$data <- gsub("intramuros manila","",solr$data)
solr$data <- gsub("philippines tel. ","",solr$data)
solr$data <- gsub("fax:","",solr$data)
solr$data <- gsub("email:","",solr$data)
solr$data <- gsub("published in the sunstar","",solr$data)
solr$data <- gsub("disclaimer: the ","",solr$data)
solr$data <- gsub("comments uploaded on this site do not necessarily represent or reflect the views of management and owner of cebudailynews","",solr$data)
solr$data <- gsub("we reserve the right to exclude comments that we deem to be inconsistent with our editorial standards","",solr$data)

# Removing authors
solr$data <- gsub("Ramon Efren Lazaro","",solr$data)
solr$data <- gsub("Raymund Catindig","",solr$data)
solr$data <- gsub("Jennifer Rendon","",solr$data)
solr$data <- gsub("Ed Amoroso","",solr$data)
solr$data <- gsub("Emmanuel Tupas","",solr$data)
solr$data <- gsub("Francis Elevado","",solr$data)
solr$data <- gsub("Eva Visperas","",solr$data)
solr$data <- gsub("Ric Sapnu","",solr$data)
solr$data <- gsub("Eugene Y. Adiong","",solr$data)
solr$data <- gsub("Rigoberto Tiglao","",solr$data)
solr$data <- gsub("tiglao.manilatimes@gmail.com Facebook","",solr$data)
solr$data <- gsub("fstatad@gmail.com","",solr$data)
solr$data <- gsub("rachelagreyes@gmail.com","",solr$data)
solr$data <- gsub("saestremera@gmail.com","",solr$data)

	# diagnostic tool for identifying junk text
	# d[ which(regexpr("gsp",d$data, ignore.case = TRUE) != -1), ][1:3,]

# Removing junk articles that have a bunch of common names
solr <- solr[ which(regexpr("licensure examination given by",solr$data, ignore.case = TRUE) == -1), ]


# Token combining
solr$data <- str_replace_all(solr$data,regex("human rights watch", ignore_case = TRUE),"humanrightswatch")
solr$data <- str_replace_all(solr$data,regex("hrw", ignore_case = TRUE),"humanrightswatch")
solr$data <- str_replace_all(solr$data,regex("human rights", ignore_case = TRUE),"humanrights")
solr$data <- str_replace_all(solr$data,regex("war on drugs", ignore_case = TRUE),"warondrugs")
solr$data <- str_replace_all(solr$data,regex("International Criminal Court", ignore_case = TRUE),"ICC")
solr$data <- str_replace_all(solr$data,regex("Kian Lloyd delos Santos", ignore_case = TRUE),"KianLloydDelosSantos")
solr$data <- str_replace_all(solr$data,regex("Kian Lloyd Santos", ignore_case = TRUE),"KianLloydDelosSantos")
solr$data <- str_replace_all(solr$data,"Dela Rosa","RonaldBatoDelaRosa")
solr$data <- str_replace_all(solr$data,"Ronald Bato dela Rosa","RonaldBatoDelaRosa")
solr$data <- str_replace_all(solr$data,regex("Ronald Dela Rosa", ignore_case = TRUE),"RonaldBatoDelaRosa")
solr$data <- str_replace_all(solr$data,regex("Leila De Lima", ignore_case = TRUE),"LeilaDeLima")
solr$data <- str_replace_all(solr$data,"SWS","SocialWeatherStations")
solr$data <- str_replace_all(solr$data,"Social Weather Stations","SocialWeatherStations")

# Insheeting the classifier data
setwd("/Users/robertchaudoin/Dropbox/Kill_Scrapes/Media coverage data/classified_from_2019_06_10/analysis_files/data/")
classifier <- read.csv("classifier_results.csv")
	colnames(classifier)[colnames(classifier)=="title"] <- "title_classifier"


# Merging

solr$insolr <- 1
hc$inhc <- 1
classifier$inclassifier <- 1

d1 <- merge(solr, hc, by = "aid_long", all.x = TRUE, all.y = TRUE, quiet = FALSE)
# rows that aren't hand coded or aren't in solr
#	dim(d1[is.na(d1$inhc),])		### rows that weren't hand coded, that's ok
#	dim(d1[is.na(d1$insolr),])

d2 <- merge(d1, classifier, by = "aid_long", all.x = TRUE, all.y = TRUE, quiet = FALSE)
	dim(d2[is.na(d2$inclassifier),])
	dim(d2[is.na(d2$insolr),])

### Limiting down to the "good" time range: 9/10/17 - 4/1/18 
d3 <- subset(d2, pub_date_fmt >= as.numeric(as.Date("2017-09-10")))
d3 <- subset(d3, pub_date_fmt <= as.numeric(as.Date("2018-04-01")))
	#	dim(d3[is.na(d3$inclassifier),])
	#	table(d3[is.na(d3$inclassifier),]$pub_date_fmt)		### they're all on 4-1-18, so it's an end boundary inclusion thing

### Limiting down to the ``good'' sources
d3 <- d3[ which(d3$source_name == "manilatimes" | d3$source_name == "interaksyon" | d3$source_name == "sunstar" | d3$source_name == "inquirer"), ]

# Dummy columns for sources
d3 <- dummy_cols(d3, select_columns = "source_name")

# Dummy variable for whether drug is in the article
d3$drugtest <- regexpr("drug",d3$data, ignore.case = TRUE)
d3$drug <- ifelse(d3$drugtest != -1,1,0)

# Write CSV to make source specific green bar - red bar pictures in Stata
write.csv(d3 %>% select(aid_long, source_url, year, month, day, aid_long, pub_date_fmt, publication_integer, source_name, drug, before_coding), "sources_of20190610_on_2019_07_16.csv")

	# This part added 9-16-2021; removes the "good time window" limitation
		d2b <- d2[ which(d2$source_name == "manilatimes" | d2$source_name == "interaksyon" | d2$source_name == "sunstar" | d2$source_name == "inquirer"), ]
		d2b$drugtest <- regexpr("drug",d2b$data, ignore.case = TRUE)
		d2b$drug <- ifelse(d2b$drugtest != -1,1,0)

		write.csv(d2b %>% select(aid_long, source_url, year, month, day, aid_long, pub_date_fmt, publication_integer, source_name, drug, before_coding), "sources_of20190610_on_2021_09_16_alldates.csv")

# After ICC dummy variable
d3$aftericc <- ifelse(d3$publication_integer >= 20180208, 1, 0)


###
# Cleaning and saving the working data
###

d <- d3
#rm("classifier","d1","d2","d3","hc","solr","solr_1","solr_2")
save(d, file = "articles_2019_07_16.RData")

# Creating an out of sample random sampling of articles to hand coded, getting accuracy scores for the classifier.
# all the ones with drug that are relevant == 0			--- 1222 (in the full dataset, not excluding "inhc == 1 " articles.)
# all the ones without drug that are relevant == 1		--- 2510 (in the full dataset, not excluding "inhc == 1 " articles.)
# random sample of relevant == 1						--- 
# random sample of relevant == 0						---

table(d$drug,d$before_coding)

# Don't need to run the subsetted commands again.  These .csv's were saved and coded around 8/12/19.
tocode_rel0_drug1 <- d[ which(d$before_coding == 0 & d$drug == 1), ]
tocode_rel0_drug1 <- tocode_rel0_drug1[is.na(tocode_rel0_drug1$inhc), ]
tocode_rel0_drug1 <- tocode_rel0_drug1[sample(nrow(tocode_rel0_drug1), 671), ]		# Taking all 671 that haven't already been coded
#save(tocode_rel0_drug1, file = "tocode_rel0_drug1.RData")
#write.csv(tocode_rel0_drug1[,c("aid_long","aid","data","url","title_solr_1","pub_date_fmt","inhc","before_coding","drug")], "tocode_rel0_drug1.csv")

tocode_rel1_drug0 <- d[ which(d$before_coding == 1 & d$drug == 0), ]
tocode_rel1_drug0 <- tocode_rel1_drug0[is.na(tocode_rel1_drug0$inhc), ]
tocode_rel1_drug0 <- tocode_rel1_drug0[sample(nrow(tocode_rel1_drug0), 800), ]		# Taking a random sample here.  There are 2510 here; none have been hand coded yet bc no "drug*"
#save(tocode_rel1_drug0, file = "tocode_rel1_drug0.RData")
#write.csv(tocode_rel1_drug0[,c("aid_long","aid","data","url","title_solr_1","pub_date_fmt","inhc","before_coding","drug")], "tocode_rel1_drug0.csv")

tocode_rel1 <- d[ which(d$before_coding == 1 & d$drug == 1), ]
tocode_rel1 <- tocode_rel1[is.na(tocode_rel1$inhc), ]
tocode_rel1 <- tocode_rel1[sample(nrow(tocode_rel1), 800), ]						# Taking a random sample.  There are 1379 that haven't already been hand coded.
#save(tocode_rel1, file = "tocode_rel1.RData")
#write.csv(tocode_rel1[,c("aid_long","aid","data","url","title_solr_1","pub_date_fmt","inhc","before_coding","drug")], "tocode_rel1.csv")

tocode_rel0 <- d[ which(d$before_coding == 0 & d$drug == 0), ]
tocode_rel0 <- tocode_rel0[is.na(tocode_rel0$inhc), ]
tocode_rel0 <- tocode_rel0[sample(nrow(tocode_rel0), 800), ]						# Taking a random sample.  There are 27K here.
#save(tocode_rel0, file = "tocode_rel0.RData")
#write.csv(tocode_rel0[,c("aid_long","aid","data","url","title_solr_1","pub_date_fmt","inhc","before_coding","drug")], "tocode_rel0.csv")

# Fully random draw, no subsetting
d_notinhc <- d[is.na(d$inhc), ]
tocode_full <- d_notinhc[sample(nrow(d_notinhc), 2000), ]						# 2000 draws, not coded pre-classifier.

coded_rel0_drug1 <- read.csv("tocode_rel0_drug1_sccoded.csv")[ ,c('aid_long', 'oos_hand_code')]
coded_rel1_drug0 <- read.csv("tocode_rel1_sccoded.csv")[ ,c('aid_long', 'oos_hand_code')]

tocode_full_wprevcode <- merge(tocode_full, coded_rel0_drug1, by = "aid_long", all.x = TRUE, all.y = FALSE)
tocode_full_wprevcode <- merge(tocode_full_wprevcode, coded_rel1_drug0, by = "aid_long", all.x = TRUE, all.y = FALSE)
#write.csv(tocode_full_wprevcode[,c("aid_long","aid","data","url","title_solr_1","pub_date_fmt","inhc","before_coding","drug","oos_hand_code.x","oos_hand_code.y")], "tocode_full_wprevcode.csv")

### Accuracy scores
#setwd("/Users/robertchaudoin/Dropbox/Kill_Scrapes/Media coverage data/classified_from_2019_06_10/analysis_files/data/")
accuracy_scores <- read.csv("tocode_full_wprevcode_sccoded.csv")[ ,c('aid_long', 'oos_hand_code','assmed','before_coding','drug','title_solr_1','pub_date_fmt')]

acc1 <- table(accuracy_scores$before_coding,accuracy_scores$oos_hand_code)
acc1
#Accuracy
(acc1[1,1]+acc1[2,2])/nrow(accuracy_scores)
# Precision
acc1[2,2]/(acc1[2,2]+acc1[2,1])
# Recall
acc1[2,2]/(acc1[2,2]+acc1[1,2])
# F1
(2*(acc1[2,2]/(acc1[2,2]+acc1[2,1]))*(acc1[2,2]/(acc1[2,2]+acc1[1,2])))/((acc1[2,2]/(acc1[2,2]+acc1[2,1])) + (acc1[2,2]/(acc1[2,2]+acc1[1,2])))


accuracy_scores$before_coding2 <- ifelse(accuracy_scores$before_coding == 1 & accuracy_scores$drug == 0, 0, accuracy_scores$before_coding)
table(accuracy_scores$before_coding2,accuracy_scores$oos_hand_code)
#	creates some false negatives (+10) but lowers false positives (-135)

acc2 <- table(accuracy_scores$before_coding2,accuracy_scores$oos_hand_code)
acc2
#Accuracy
(acc2[1,1]+acc2[2,2])/nrow(accuracy_scores)
# Precision
acc2[2,2]/(acc2[2,2]+acc2[2,1])
# Recall
acc2[2,2]/(acc2[2,2]+acc2[1,2])
# F1
(2*(acc2[2,2]/(acc2[2,2]+acc2[2,1]))*(acc2[2,2]/(acc2[2,2]+acc2[1,2])))/((acc2[2,2]/(acc2[2,2]+acc2[2,1])) + (acc2[2,2]/(acc2[2,2]+acc2[1,2])))

# Response to R3 question about accuracy and post-treatment data
library(lubridate)
accuracy_scores$pub_date_ymd <- mdy(accuracy_scores$pub_date_fmt)

# sanity checks on code
#dim(accuracy_scores)
#dim(accuracy_scores[ which(accuracy_scores$pub_date_ymd <= "2017-10-23"), ])

accuracy_scores_beforeicc <- accuracy_scores[ which(accuracy_scores$pub_date_ymd < "2018-02-08"), ]
accuracy_scores_aftericc <- accuracy_scores[ which(accuracy_scores$pub_date_ymd >= "2018-02-08"), ]

acc2_beforeicc <- table(accuracy_scores_beforeicc$before_coding2, accuracy_scores_beforeicc$oos_hand_code)
acc2_beforeicc
#Accuracy
(acc2_beforeicc[1,1]+acc2_beforeicc[2,2])/nrow(accuracy_scores_beforeicc)
# Precision
acc2_beforeicc[2,2]/(acc2_beforeicc[2,2]+acc2_beforeicc[2,1])
# Recall
acc2_beforeicc[2,2]/(acc2_beforeicc[2,2]+acc2_beforeicc[1,2])
# F1
(2*(acc2_beforeicc[2,2]/(acc2_beforeicc[2,2]+acc2_beforeicc[2,1]))*(acc2_beforeicc[2,2]/(acc2_beforeicc[2,2]+acc2_beforeicc[1,2])))/((acc2_beforeicc[2,2]/(acc2_beforeicc[2,2]+acc2_beforeicc[2,1])) + (acc2_beforeicc[2,2]/(acc2_beforeicc[2,2]+acc2_beforeicc[1,2])))

acc2_aftericc <- table(accuracy_scores_aftericc$before_coding2, accuracy_scores_aftericc$oos_hand_code)
acc2_aftericc
#Accuracy
(acc2_aftericc[1,1]+acc2_aftericc[2,2])/nrow(accuracy_scores_aftericc)
# Precision
acc2_aftericc[2,2]/(acc2_aftericc[2,2]+acc2_aftericc[2,1])
# Recall
acc2_aftericc[2,2]/(acc2_aftericc[2,2]+acc2_aftericc[1,2])
# F1
(2*(acc2_aftericc[2,2]/(acc2_aftericc[2,2]+acc2_aftericc[2,1]))*(acc2_aftericc[2,2]/(acc2_aftericc[2,2]+acc2_aftericc[1,2])))/((acc2_aftericc[2,2]/(acc2_aftericc[2,2]+acc2_aftericc[2,1])) + (acc2_aftericc[2,2]/(acc2_aftericc[2,2]+acc2_aftericc[1,2])))
